
> # FILE:     UnmaskingReplicationSI.R
> # AUTHOR:   Austin Hart & J. Scott Matthews
> # DATE:     July 14, 2021
> # SUBJECT:  REPLICATION FILES, JOURNAL OF POLITICS
> #           Supplemental Info/Online Appendix
> 
> # Notes for replication:
> #   Required packages: tidyverse, knitr, broom, cowplot, lemon, gtable
> #   Custom functions & graphical theme defined below
> 
> 
> # SETUP ---------------------------------------------------
> 
> # load packages
>   library(tidyverse)
-- Attaching packages ------------------------------------------------- tidyverse 1.3.0 --
v ggplot2 3.3.3     v purrr   0.3.4
v tibble  3.0.6     v dplyr   1.0.4
v tidyr   1.1.2     v stringr 1.4.0
v readr   1.4.0     v forcats 0.5.1
-- Conflicts ---------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()

>   library(knitr);library(broom)

>   library(cowplot);library(lemon);library(gtable)

Attaching package: �lemon�

The following object is masked from �package:purrr�:

    %||%

The following objects are masked from �package:ggplot2�:

    CoordCartesian, element_render


> # directory (set directory below if you choose)
>   # setwd("your.path.here")
> 
> # Data for SI
>   # main analysis
>     load(url("https://www.dropbox.com/s/c9safb3z4bib6wb/HartMatthewsJOP.RData?dl=1"))

>   # rand/balance checks
>     load(url("https://www.dropbox.com/s/fsq4irn9dfwhlc3/HartMatthewsJOPSI.RData?dl=1"))

> # CUSTOM FUNCTIONS ---------------------------------------
>   
> # FctClean() - function to relabel + reorder factors 
>   FctClean <- function(facvar, ...){
+     fct_recode(fct_relevel(facvar, ...), ...)
+   }  

> # FctCaseWhen() - function to create ordered factor from case_when())    
>   FctCaseWhen = function(...) {
+     args = as.list(match.call())
+     levels = sapply(args[-1], function(f) f[[3]])  # extract RHS of formula
+     levels = levels[!is.na(levels)]
+     factor(dplyr::case_when(...), levels = levels)
+   } 

> # Custom theme for visuals
>   mytheme = theme_bw() + theme(
+     plot.title = element_text(hjust = 0, size = 10, face = 'bold'),
+     legend.title = element_text(size = 10),
+     legend.text = element_text(size = 10),
+     axis.text.x = element_text(size = 10),
+     axis.title.x = element_text(margin = unit(c(3, 0, 0, 0), "mm"), size = 10),
+     axis.text.y = element_text(size = 10), axis.title.y = element_text(margin = unit(c(0, 3, 0, 0), "mm"), size = 8),
+     panel.grid.minor = element_blank(),
+     panel.grid.major.y = element_line(color = 'gray80',linetype = 'solid',size = 0.35),
+     panel.grid.major.x = element_blank(),
+     axis.ticks = element_blank(),
+     panel.spacing = unit(2, "lines"),
+     plot.margin=unit(c(.2,.2,.2,.2),"cm"),
+     panel.border = element_blank(),
+     strip.background = element_blank(),
+     strip.text = element_text(size = 10, face = 'bold')
+   )    

> # shiftLegend - function to move legend within faceted plot  
>   shiftLegend = function(p) {
+     pnls = cowplot::plot_to_gtable(p) %>% 
+       gtable::gtable_filter("panel") %>%
+       with(setNames(grobs, layout$name)) %>% 
+       purrr::keep(~identical(.x,zeroGrob()))
+     if( length(pnls) == 0 ) stop( "No empty facets in the plot" )
+     lemon::reposition_legend( p, "center", panel=names(pnls) )
+   } 

> # PARTICIPANTS ----------------------------------
>   
> # Attention
>   fct_count(as_factor(dfsi$Attention[!is.na(dfsi$Female)]), prop = T)
# A tibble: 2 x 3
  f         n     p
  <fct> <int> <dbl>
1 0       749 0.159
2 1      3950 0.841

> # Duration
>   summary(dfsi$DurationSec/60)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.200   3.300   4.783   5.633   6.867 228.833 

> # Demographics
>   fct_count(dfsi$Female, prop = T)
# A tibble: 3 x 3
  f          n       p
  <fct>  <int>   <dbl>
1 Female  2297 0.486  
2 Other   2402 0.508  
3 <NA>      30 0.00634

>   fct_count(dfsi$RaceEth, prop = T)
# A tibble: 7 x 3
  f                                      n       p
  <fct>                              <int>   <dbl>
1 White                               3568 0.754  
2 Black or African American            422 0.0892 
3 Asian / Pacific Islander             327 0.0691 
4 Hispanic or Latino                   264 0.0558 
5 Native American or American Indian    45 0.00952
6 Other                                 66 0.0140 
7 <NA>                                  37 0.00782

>   fct_count(dfsi$Age, prop = T)
# A tibble: 8 x 3
  f                     n        p
  <fct>             <int>    <dbl>
1 1                     1 0.000211
2 18-29 years        1568 0.332   
3 30-39 years        1759 0.372   
4 40-49 years         703 0.149   
5 50-65 years         570 0.121   
6 65 years or older   108 0.0228  
7 Less than 18          1 0.000211
8 <NA>                 19 0.00402 

>   fct_count(dfsi$Edu, prop = T)
# A tibble: 8 x 3
  f                                            n       p
  <fct>                                    <int>   <dbl>
1 Less than high school                       36 0.00761
2 High school graduate or equivalent (GED)   478 0.101  
3 Some college                              1064 0.225  
4 2 year degree                              560 0.118  
5 4 year degree                             1863 0.394  
6 Professional degree                        590 0.125  
7 Doctorate                                  107 0.0226 
8 <NA>                                        31 0.00656

> # COMPREHENSION ---------------------------------
>   
> # Overall
>   dfsi %>% 
+     filter(!is.na(RulesTotal)) %>%
+     count(RulesTotal) %>%
+     mutate(prop = 100 * n/sum(n))
# A tibble: 3 x 3
  RulesTotal     n  prop
*      <dbl> <int> <dbl>
1          0   634  16.2
2          1  1808  46.3
3          2  1460  37.4

>   chisq.test(table(dfsi$RulesTotal))

	Chi-squared test for given probabilities

data:  table(dfsi$RulesTotal)
X-squared = 559.11, df = 2, p-value < 2.2e-16


> # By study
>   dfsi %>%
+     filter(!is.na(RulesTotal)) %>%
+     group_by(Study, RulesTotal) %>%
+     summarise(n = n()) %>%
+     mutate(perc = 100 * n/sum(n)) %>%
+     select(-n)%>%
+     pivot_wider(
+       names_from = Study, 
+       values_from = perc
+     ) %>%
+     knitr::kable(digits = 1, format = 'latex')
`summarise()` has grouped output by 'Study'. You can override using the `.groups` argument.

\begin{tabular}{r|r|r|r|r|r}
\hline
RulesTotal & 1 & 2 & 3 & 4 & 5\\
\hline
0 & 12.8 & 20.2 & 16.2 & 18.5 & 16.2\\
\hline
1 & 46.9 & 51.8 & 45.8 & 45.5 & 45.2\\
\hline
2 & 40.4 & 28.1 & 38.0 & 36.0 & 38.6\\
\hline
\end{tabular}

> # RANDOMIZATION ---------------------------------
>   
> # worker type
>   # Isolate types and stack
>     type = 
+       dfsi %>%
+       select(TypeA,TypeB) %>%
+       pivot_longer(
+         cols = c(TypeA,TypeB),
+         names_to = 'Worker',
+         values_to = 'Competence'
+       )

>     summary(type$Competence)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
    950    1075    1199    1200    1325    1451       2 

>   # Plot
>     pt = 
+       type %>%
+       ggplot(aes(x = Competence, ..density..)) +
+       geom_histogram(color = "gray20", fill = "gray", binwidth = 26) +
+       scale_y_continuous(limits = c(0,0.003), expand = c(0,0)) +
+       scale_x_continuous(limits = c(900,1500)) +
+       labs(title = expression("(a) " ~ mu[w] %~% U(950,1450)), 
+            y = "Density", x = "Worker Type (units/wk)") +
+       mytheme

>     rm(type)

>   # Check average output by assigned type
>     out =
+       bind_rows(
+         dfsi %>% select(type = TypeA, avg = AvgA),
+         dfsi %>% select(type = TypeB, avg = AvgB)
+       )

>     rvl = 
+       out %>%
+       ggplot(aes(x = type, y = avg)) +
+       geom_bin2d(bins = 20, show.legend = F) +
+       geom_abline(slope=1, intercept=0) +
+       geom_smooth(fill = "cornflowerblue", color = 'white', method = 'loess', se = T) +
+       coord_cartesian(ylim = c(850,1550),
+                       xlim = c(920,1480)) +
+       scale_x_continuous(breaks = c(1000,1200,1400)) +
+       scale_fill_gradient2(low="gray70", high = "gray10") +
+       labs(title = expression("(b) " ~ E(Y[w]) == mu[w]),
+            y = "Output (8 week average)", x = "Assigned worker type") +
+       mytheme

>     summary(lm(avg ~ type, data = out))    

Call:
lm(formula = avg ~ type, data = out)

Residuals:
    Min      1Q  Median      3Q     Max 
-324.57  -45.69   -0.74   45.01  357.57 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -4.683338   6.694196    -0.7    0.484    
type         1.003691   0.005539   181.2   <2e-16 ***
---
Signif. codes:  0 �***� 0.001 �**� 0.01 �*� 0.05 �.� 0.1 � � 1

Residual standard error: 77.95 on 9454 degrees of freedom
  (2 observations deleted due to missingness)
Multiple R-squared:  0.7765,	Adjusted R-squared:  0.7764 
F-statistic: 3.284e+04 on 1 and 9454 DF,  p-value: < 2.2e-16


>     rm(out)

>   # Merge plots and save
>     chx = gridExtra::grid.arrange(pt, rvl, ncol = 2)
`geom_smooth()` using formula 'y ~ x'

>     ggsave(filename = "mu_checks.pdf",
+            plot = chx,
+            width = 6,
+            height = 2.5,
+            dpi = 1000) 

> # BALANCE ---------------------------------------
>     
> # Study 1
>   b1 =
+     dfsi %>%
+     filter(Study == 1) %>%
+     aov(NoiseS1 == "Low (100)" ~ 
+           Age + Edu + RaceEth + Female + as.factor(RulesTotal), data = .) %>%
+     broom::tidy() %>%
+     .[1:5,c(1,5,6)]

> # Study 4        
>   b4 =
+     dfsi %>%
+     filter(Study == 4) %>%
+     aov(Design == 'Unmasking' ~ 
+           Age + Edu + RaceEth + Female + as.factor(RulesTotal), data = .) %>%
+     broom::tidy() %>%
+     .[1:5,c(1,5,6)]

> # Study 5  
>   b5 =
+     dfsi %>%
+     filter(Study == 5) %>%
+     aov(Design == 'Unmasking' ~ 
+           Age + Edu + RaceEth + Female + as.factor(RulesTotal), data = .) %>%
+     broom::tidy() %>%
+     .[1:5,c(1,5,6)]

> # Combine
>   left_join(b1,b4, by = 'term', suffix = c('.1','.2')) %>%
+     left_join(b5, by = 'term')
# A tibble: 5 x 7
  term                  statistic.1 p.value.1 statistic.2 p.value.2 statistic p.value
  <chr>                       <dbl>     <dbl>       <dbl>     <dbl>     <dbl>   <dbl>
1 Age                         0.974  0.421         0.661      0.620     1.50    0.201
2 Edu                         0.941  0.465         0.116      0.995     1.54    0.162
3 RaceEth                     0.291  0.918         0.522      0.760     0.663   0.652
4 Female                     13.2    0.000300      0.0483     0.826     0.776   0.379
5 as.factor(RulesTotal)       0.725  0.485         0.0274     0.973     1.61    0.201

>   rm(b1,b4,b5)

> # RETENTION -------------------------------------
>  
> # Figure 2, SI: baseline response to production
>   # Stack and clean for graphing retention functions
>     df =
+       dfmain %>%
+       pivot_longer(
+         cols = c(AvgPre,AvgPost),
+         names_to = 'Period',
+         values_to = 'Production'
+       ) %>%
+       transmute(
+         VoteB,
+         Production,
+         Period = FctClean(
+           Period,
+           '"B"^"+"' = "AvgPre",
+           '"B"["post"]' = "AvgPost"
+         ),
+         Sfac = FctCaseWhen(
+           Study == 1 ~ '1. Unmasking (n = 849)',
+           Study == 2 ~ '2. Recognition (n = 368)',
+           Study == 3 ~ '3. Unmasking (n = 550)',
+           Study == 4 ~ '4. Unm/Rec (n = 720)',
+           Study == 5 ~ '5. Unm/Rec (n = 1376)'
+         )
+       ) %>%
+       group_by(Sfac,Period) %>%
+       mutate(
+         ProdZ = (Production - mean(Production))/sd(Production)
+       ) %>%
+       ungroup()

>     glabs = parse(text = c('"Pre-Incumbent, F"["pre"]',
+                            '"Incumbent tenure, F"["post"]'))

>   # Plot retention functions    
>     lp = 
+       df %>%
+       ggplot( aes(y = VoteB, x = Production) ) +
+       facet_wrap( vars(Sfac), nrow = 2 ) +
+       stat_smooth(
+         aes( linetype = Period, color = Period ),
+         method = 'loess', span = .94, se = FALSE
+       ) +
+       scale_color_manual( values = c( 'black','gray39' ), labels = glabs ) +
+       scale_linetype_manual( values = c( 'dotted','solid' ), labels = glabs ) +
+       coord_cartesian( xlim = c(700,1700), ylim = c(0,1)) +
+       scale_x_continuous( breaks = c(950,1200,1450) ) +
+       scale_y_continuous( limits = c(-1,2), expand = c(0,.01),
+                           breaks = c(0,0.5,1) ) +
+       labs(
+         y = 'Proportion retaining incumbent',
+         x = 'Factory output (units/worker/week)',
+         linetype = 'Production Period',
+         color = 'Production Period'
+       ) +
+       mytheme 

>   # Shift legend to open facet  
>     lpf = shiftLegend(lp)
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'

>   # Save final figure    
>     ggsave(filename = "baseline.png", plot = lpf, width = 6, height = 3)

>     rm(lp,lpf,glabs,df)

> # MODEL ESTIMATES -------------------------------
>   
> # Full Equation 1 estimates (Table 3, SI)
>   # Isolate data
>     df =
+       dfmain %>%
+       select(VoteB,AvgPre,AvgPost,Study,Design)

>   # Estimated performance vote by study
>     Base = 
+       df %>%
+       split( f = list(.$Study,.$Design), drop = TRUE ) %>%
+       map_df(
+         ~ tidy(lm(VoteB ~ AvgPre + AvgPost, data = .x)),
+         .id = 'Study'
+       ) 

>   # Primary estimates  
>     Parms =
+       Base %>%
+       select(-std.error,-statistic) %>%
+       filter(term != '(Intercept)') %>%
+       pivot_wider(
+         values_from = c(estimate,p.value),
+         names_from = c(term)
+       ) %>%
+       mutate_at(
+         vars(starts_with('estimate')),
+         ~.*100
+       ) %>%
+       mutate_at(
+         vars(starts_with('p.value')),
+         ~.*0.5 # 1-tailed test
+       ) %>%
+       arrange(Study) %>%
+       .[, c(1, 2, 4, 3, 5)] # reorder columns

>   # Intercept
>     Int = 
+       Base %>%
+       filter(term == '(Intercept)') %>%
+       select(Study,estimate,std.error) %>%
+       arrange(Study) %>%
+       rename(Intercept = estimate)

>   # Join
>     left_join(Parms,Int) %>%
+       .[c(1,2,3,5,4,7,6),]
Joining, by = "Study"
# A tibble: 7 x 7
  Study estimate_AvgPre p.value_AvgPre estimate_AvgPost p.value_AvgPost Intercept
  <chr>           <dbl>          <dbl>            <dbl>           <dbl>     <dbl>
1 1.Un~         -0.0929       3.74e-19            0.135        1.18e-21    0.251 
2 2.Re~         -0.0554       2.15e- 6            0.123        3.83e-19   -0.105 
3 3.Un~         -0.114        9.09e- 7            0.158        6.98e- 8    0.211 
4 4.Un~         -0.129        1.89e-20            0.151        4.30e-16    0.437 
5 4.Re~         -0.0650       2.77e- 6            0.153        5.16e-22   -0.410 
6 5.Un~         -0.138        4.06e-26            0.192        5.09e-29    0.0116
7 5.Re~         -0.0466       3.63e- 5            0.156        1.04e-37   -0.695 
# ... with 1 more variable: std.error <dbl>

>     rm(df,Base,Parms,Int)

> # Alternate, using assigned types   
>   # Isolate data
>     df =
+       dfmain %>%
+       select(VoteB,TypeA,TypeB,Study,Design)

>   # Estimated performance vote by study
>     Base = 
+       df %>%
+       split( f = list(.$Study,.$Design), drop = TRUE ) %>%
+       map_df(
+         ~ tidy(lm(VoteB ~ TypeA + TypeB, data = .x)),
+         .id = 'Study'
+       ) 

>   # Primary estimates  
>     Parms =
+       Base %>%
+       select(-std.error,-statistic) %>%
+       filter(term != '(Intercept)') %>%
+       pivot_wider(
+         values_from = c(estimate,p.value),
+         names_from = c(term)
+       ) %>%
+       mutate_at(
+         vars(starts_with('estimate')),
+         ~.*100
+       ) %>%
+       mutate_at(
+         vars(starts_with('p.value')),
+         ~.*0.5 # 1-tailed test
+       ) %>%
+       arrange(Study) %>%
+       .[, c(1, 2, 4, 3, 5)] # reorder columns

>   # Intercept
>     Int = 
+       Base %>%
+       filter(term == '(Intercept)') %>%
+       select(Study,estimate,std.error) %>%
+       arrange(Study) %>%
+       rename(Intercept = estimate)

>   # Join
>     left_join(Parms,Int) %>%
+       .[c(1,2,3,5,4,7,6),]
Joining, by = "Study"
# A tibble: 7 x 7
  Study      estimate_TypeA p.value_TypeA estimate_TypeB p.value_TypeB Intercept std.error
  <chr>               <dbl>         <dbl>          <dbl>         <dbl>     <dbl>     <dbl>
1 1.Unmaski~        -0.0314  0.000747             0.0758      5.78e-15    0.225      0.165
2 2.Recogni~        -0.0580  0.0000483            0.123       8.31e-15   -0.0637     0.264
3 3.Unmaski~        -0.0114  0.187                0.0637      1.92e- 7    0.118      0.208
4 4.Unmaski~        -0.0733  0.0000000506         0.106       2.31e-15    0.307      0.231
5 4.Recogni~        -0.0804  0.00000350           0.137       3.05e-13   -0.0432     0.329
6 5.Unmaski~        -0.0453  0.00000415           0.107       5.08e-23   -0.0838     0.170
7 5.Recogni~        -0.0411  0.00168              0.151       4.34e-24   -0.702      0.236

>     rm(df,Base,Parms,Int)
Warning messages:
1: Removed 2 rows containing non-finite values (stat_bin). 
2: Removed 2 rows containing missing values (geom_bar). 
3: Removed 2 rows containing non-finite values (stat_bin2d). 
4: Removed 2 rows containing non-finite values (stat_smooth). 
